from langchain_community.document_loaders import WebBaseLoader
import re

# Load the page at the given URL; the loop below iterates the result and reads
# each item's ``page_content``.
loader = WebBaseLoader("http://www.npc.gov.cn/zgrdw/npc/xinwen/2013-10/26/content_1811773.htm")
doc = loader.load()
# Matches one or more line breaks, each followed by optional whitespace.
# Substituting '' joins the text onto a single line in the output file.
pattern = re.compile(r'(\r?\n\s*)+')
# Matches any run of whitespace; used only for the console preview.
whitespace_pattern = re.compile(r'\s+')

with open(file="c:/ai/doc/消费者权益保护法.txt", mode='w', encoding='utf8') as f:
    for d in doc:
        # Strip once and reuse instead of re-stripping for every use.
        text = d.page_content.strip()
        # Skip items whose stripped content is 10 characters or shorter.
        if len(text) > 10:
            f.write(pattern.sub('', text))
            # str.replace() treats its argument as literal text, so the
            # whitespace removal has to go through the regex engine.
            print('==== ' + whitespace_pattern.sub('', text))
print('文件写入完成')